In [1]:
    
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
# Collect every .ipynb file that sits directly inside a sub-directory of the
# testbed root, then wrap each path in a NotebookMiner.
FINAL_DIR = '../testbed/Final'  # single definition of the data root

# os.listdir() returns entries in arbitrary order; sorting keeps the notebook
# order (and everything derived from it) stable from one run to the next.
people = sorted(os.listdir(FINAL_DIR))
notebooks = []
for person in people:
    person_dir = os.path.join(FINAL_DIR, person)
    if not os.path.isdir(person_dir):
        continue  # plain files at the top level are ignored
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )
notebook_objs = [NotebookMiner(path) for path in notebooks]
    
In [2]:
    
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
# Run the whole pipeline once with every stage built from its no-argument
# constructor, then print the summary reported by the AstorError stage.
a = Features(notebook_objs)
stage_classes = (
    GetASTFeatures,
    ResampleByNode,
    GetImports,
    FeatureEncoding,
    KmeansEncoder,
    AstorError,
)
# Stages are instantiated left to right, i.e. in the order they are chained.
gastf, rbn, gi, fe, ke, ae = (make_stage() for make_stage in stage_classes)
pipe = Pipeline([gastf, rbn, gi, fe, ke, ae])
a = pipe.transform(a)
print(ae.get_summary())
    
    
In [3]:
    
# Re-run the full pipeline once per cluster count and keep, for every run,
# the AstorError metrics together with the KmeansEncoder labels.
avg_dist_general, avg_sim_general = [], []
coverage_general, number_templates_general = [], []
labels = []
for value in [1200, 500, 100, 10, 1]:
    print('Calculating for value: ', value)
    # Fresh features and fresh stage objects for each run.
    a = Features(notebook_objs)
    gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
    ke, ae = KmeansEncoder(n_clusters=value), AstorError()
    pipe = Pipeline([gastf, rbn, gi, fe, ke, ae])
    a = pipe.transform(a)
    # Read the results in a fixed order, then file each into its own list.
    for bucket, measured in (
        (avg_dist_general, ae.average_distance()),
        (avg_sim_general, ae.average_similarity()),
        (coverage_general, ae.get_percent_coverage()),
        (number_templates_general, ae.get_unique_templates()),
        (labels, ke.get_labels()),
    ):
        bucket.append(measured)
    
    
In [5]:
    
%matplotlib inline
import matplotlib.pyplot as plt
# Draw the four sweep metrics in a 2x2 grid, one panel per metric, each
# plotted against the number of templates requested for the run.
plt.rcParams['figure.figsize'] = (20, 20)
# Must hold the same values, in the same order, as the sweep loop that filled
# the *_general lists.
x = [1200, 500, 100, 10, 1]
fig, axes = plt.subplots(2,2)

# (axes, data series, line label, text used for both the title and the y label)
panels = [
    (axes[0,0], avg_dist_general,
     'Average Distance (All calls are the same)', "Average edit distance"),
    (axes[0,1], avg_sim_general,
     'Average Similarity (All calls are the same)', "Average matching characters"),
    (axes[1,0], coverage_general,
     'Coverage (All calls are the same)', "Coverage of templates"),
    (axes[1,1], number_templates_general,
     'Number of Templates (All calls are the same)', "Number of templates"),
]
for ax, series, line_label, caption in panels:
    # x is maintained by hand, so fail with a readable message if it has
    # drifted away from the recorded results.
    assert len(series) == len(x), (
        "x has %d entries but the metric list has %d" % (len(x), len(series))
    )
    ax.plot(x, series, label = line_label)
    ax.set_title(caption)
    ax.set_xlabel('Number of templates')
    ax.set_ylabel(caption)
    
    Out[5]:
    
In [35]:
    
def get_vec_sizes(v):
    """Count how often each distinct element occurs in ``v``.

    Args:
        v: iterable of hashable elements.

    Returns:
        A list of occurrence counts, one per distinct element, ordered by
        the first appearance of that element in ``v``.
    """
    occurrences = {}
    for item in v:
        occurrences[item] = occurrences.get(item, 0) + 1
    return [*occurrences.values()]
def num_one(v):
    """Return how many elements of ``v`` compare equal to 1.

    Args:
        v: iterable of values.

    Returns:
        The number of elements ``el`` for which ``el == 1`` holds
        (0 for an empty iterable).
    """
    return sum(1 for el in v if el == 1)
    
In [56]:
    
import numpy as np
# Compare, across the sweep runs, the median cluster size with the number of
# clusters that contain exactly one element.
plt.rcParams['figure.figsize'] = (10, 5)
# Count members per cluster once per run instead of recomputing for each use.
cluster_sizes = [get_vec_sizes(v) for v in labels]
# The last run is left off the median curve; with the x values defined for
# the earlier figure that is the single-cluster run (presumably omitted
# because its one cluster's size would dominate the axis - confirm).
median_sizes = [np.median(sizes) for sizes in cluster_sizes[:-1]]
singleton_counts = [num_one(sizes) for sizes in cluster_sizes]
n1, = plt.plot(x[:-1], median_sizes, label='Median size of cluster' )
n2, = plt.plot(x, singleton_counts, label='Number of clusters with one element')
plt.legend(handles=[n1,n2])
plt.xlabel('Number of clusters')
# Trailing semicolon keeps the Text repr out of the cell output.
plt.ylabel('Number of examples');
    
    Out[56]:
    
In [71]:
    
# One histogram per sweep run, showing how cluster sizes are distributed.
plt.rcParams['figure.figsize'] = (10, 30)
# squeeze=False always returns a 2-D array of Axes, so indexing also works
# when labels holds a single run (plt.subplots would otherwise hand back a
# bare Axes object that cannot be subscripted).
fig, axes = plt.subplots(len(labels), squeeze=False)
axes = axes[:, 0]  # single column -> 1-D array with one Axes per run
for i, run_labels in enumerate(labels):
    ax = axes[i]
    n_clusters = x[i]  # cluster count requested for this run
    ax.hist(get_vec_sizes(run_labels), bins = 100)
    ax.set_xlabel('Number of data points')
    ax.set_ylabel('Number of clusters')
    # Use the singular form for the one-cluster run.
    title = "1 Cluster" if n_clusters == 1 else str(n_clusters) + " Clusters"
    ax.set_title(title)
    
    
In [ ]: